In [32]:
# Imports — standard library first, then third-party.
# Fix: `os` was imported twice (once here, once further down); the
# duplicate has been removed and all imports grouped in one place.
import os
import glob
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Global matplotlib configuration for all figures in this notebook.
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['axes.titlesize'] = 24
plt.rcParams['axes.labelsize'] = 20
plt.rcParams['lines.linewidth'] = 3
plt.rcParams['lines.markersize'] = 10
plt.rcParams['xtick.labelsize'] = 16
plt.rcParams['ytick.labelsize'] = 16
# Applied after the rcParams, as in the original execution order: the
# style sheet overrides any rcParams keys it also defines.
plt.style.use('ggplot')

# NOTE(review): blanket warning suppression hides real problems (e.g.
# pandas SettingWithCopyWarning) — consider narrowing the filter.
warnings.filterwarnings('ignore')

Import Data

Zendesk Agg

In [2]:
path = '../../Data/Zendesk/'

# Load every monthly Zendesk export into one frame.
# Fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
# and appending inside a loop is quadratic — collect the frames and
# concatenate once instead.
frames = [pd.read_excel(f) for f in glob.glob(path + "*.xlsx")]
df_tickets_agg = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

Metabase data

In [3]:
# Check-in counts per person, with the count column renamed to a
# consistent capitalised name.
df_checkins = (
    pd.read_csv('../../Data/Metabase/checkins.csv')
      .filter(['person_id', 'count checkin'])
      .rename(columns={'count checkin': 'Count_checkin'})
)

# Plan values per person; the aggregated 'max' column becomes 'Plan_value'.
df_plans = (
    pd.read_csv('../../Data/Metabase/plan_values.csv')
      .rename(columns={'max': 'Plan_value'})
)
In [4]:
# Load both cancellation extracts and keep only the identifying columns.
df_cancel_1 = pd.read_csv('../../Data/Metabase/cancelamentos_1.csv')
df_cancel_2 = pd.read_csv('../../Data/Metabase/cancelamentos_2.csv')

df_cancel = (
    pd.concat([df_cancel_1, df_cancel_2])
      .filter(['person_id', 'change_to_type'])
)

# Build a cancellation flag: one row per person, Cancel == 1 for anyone
# who appears in the cancellation extracts.
df_cancel = (
    df_cancel
    .groupby('person_id').count()
    .reset_index()
    .sort_values('change_to_type', ascending=False)
    .assign(Cancel=1)
)
In [5]:
# Enrich the Zendesk ticket aggregate with plan value, check-in count and
# the cancellation flag, all keyed on the requester id (left joins keep
# every ticket row even when a source has no match).
df_agg = (
    df_tickets_agg
    .merge(df_plans, how='left', left_on='Requester_ID', right_on='person_id')
    .merge(df_checkins, how='left', left_on='Requester_ID', right_on='person_id')
    .merge(df_cancel, how='left', left_on='Requester_ID', right_on='person_id')
)
In [6]:
# Keep only the modelling-relevant columns (drops the duplicated
# person_id join keys, among others).
keep_cols = [
    'Requester_ID', 'Month', 'Volume_7days', 'Volume_15days',
    'Volume_Monthly', 'ReclameAqui', 'SocialMedia', 'Tempo_Medio_Chat',
    'Tempo_Medio_Email', 'AWT_Chat', '%NFCR', '%Insatisfação(CSAT)',
    'CSAT_Rated', 'Plan_value', 'Count_checkin', 'Cancel',
]
df_agg = df_agg.filter(keep_cols)
In [7]:
# Column dtypes and non-null counts of the merged frame.
df_agg.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 152534 entries, 0 to 152533
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Requester_ID         152534 non-null  int64  
 1   Month                152534 non-null  object 
 2   Volume_7days         152534 non-null  int64  
 3   Volume_15days        152534 non-null  int64  
 4   Volume_Monthly       152534 non-null  int64  
 5   ReclameAqui          152534 non-null  object 
 6   SocialMedia          152534 non-null  object 
 7   Tempo_Medio_Chat     152534 non-null  float64
 8   Tempo_Medio_Email    152028 non-null  float64
 9   AWT_Chat             152534 non-null  float64
 10  %NFCR                152534 non-null  float64
 11  %Insatisfação(CSAT)  152534 non-null  float64
 12  CSAT_Rated           152534 non-null  int64  
 13  Plan_value           96479 non-null   float64
 14  Count_checkin        83903 non-null   float64
 15  Cancel               20429 non-null   float64
dtypes: float64(8), int64(5), object(3)
memory usage: 19.8+ MB
In [8]:
# Quick look at the merged rows (NaNs come from the left joins above).
df_agg.head(3)
Out[8]:
Requester_ID Month Volume_7days Volume_15days Volume_Monthly ReclameAqui SocialMedia Tempo_Medio_Chat Tempo_Medio_Email AWT_Chat %NFCR %Insatisfação(CSAT) CSAT_Rated Plan_value Count_checkin Cancel
0 10005693 2021_01 1 1 1 no no 0.0 8795.666667 0.0 0.0 0.0 0 NaN NaN NaN
1 10008091 2021_01 1 1 1 no no 0.0 2503.750000 0.0 0.0 0.0 0 NaN NaN NaN
2 10014513 2021_01 1 1 1 no no 0.0 12367.208333 0.0 0.0 0.0 0 NaN NaN NaN
In [9]:
# Percentage of missing values per column.
df_agg.isnull().mean() * 100
Out[9]:
Requester_ID            0.000000
Month                   0.000000
Volume_7days            0.000000
Volume_15days           0.000000
Volume_Monthly          0.000000
ReclameAqui             0.000000
SocialMedia             0.000000
Tempo_Medio_Chat        0.000000
Tempo_Medio_Email       0.331729
AWT_Chat                0.000000
%NFCR                   0.000000
%Insatisfação(CSAT)     0.000000
CSAT_Rated              0.000000
Plan_value             36.749184
Count_checkin          44.993903
Cancel                 86.606920
dtype: float64
In [10]:
# Visual overview of the missingness pattern across columns.
# NOTE(review): imports belong in the top import cell for reproducibility.
import missingno as msno
%matplotlib inline
msno.matrix(df_agg)
Out[10]:
<AxesSubplot:>

Data Manipulation

Data Preparation

In [11]:
# Missing-value imputation.
# Fix: the original used chained-indexing assignment
# (`df.col.loc[mask] = value` and `df.col[mask] = value`), which relies
# on SettingWithCopy behaviour and only appeared to work because warnings
# are globally silenced. `fillna` is the supported idiom and produces the
# same result (the median is computed ignoring NaN in both versions).

# Chat/e-mail handling times: NaN means no interaction of that type.
df_agg['Tempo_Medio_Chat'] = df_agg['Tempo_Medio_Chat'].fillna(0)
df_agg['Tempo_Medio_Email'] = df_agg['Tempo_Medio_Email'].fillna(0)

# Assume a missing check-in count equals the median number of check-ins
# (may be an artifact of the Metabase query — TODO confirm).
df_agg['Count_checkin'] = df_agg['Count_checkin'].fillna(df_agg['Count_checkin'].median())

# Assume NaN in Cancel means the customer never tried to cancel
# (may be an artifact of the Metabase query — TODO confirm).
df_agg['Cancel'] = df_agg['Cancel'].fillna(0)

# Missing plan values are imputed with the median plan value.
df_agg['Plan_value'] = df_agg['Plan_value'].fillna(df_agg['Plan_value'].median())

# Missing average waiting time means no chat took place.
df_agg['AWT_Chat'] = df_agg['AWT_Chat'].fillna(0)

Target Creation

In [12]:
df_agg.head(3)
Out[12]:
Requester_ID Month Volume_7days Volume_15days Volume_Monthly ReclameAqui SocialMedia Tempo_Medio_Chat Tempo_Medio_Email AWT_Chat %NFCR %Insatisfação(CSAT) CSAT_Rated Plan_value Count_checkin Cancel
0 10005693 2021_01 1 1 1 no no 0.0 8795.666667 0.0 0.0 0.0 0 119.9 15.0 0.0
1 10008091 2021_01 1 1 1 no no 0.0 2503.750000 0.0 0.0 0.0 0 119.9 15.0 0.0
2 10014513 2021_01 1 1 1 no no 0.0 12367.208333 0.0 0.0 0.0 0 119.9 15.0 0.0
In [13]:
# Target definition: 1 marks a "critical" customer.
df_agg['Target'] = 0

# Flag customers with high monthly contact volume or a ReclameAqui
# complaint. Fix: uses `df.loc[mask, col] = value` instead of the original
# chained indexing (`df.Target.loc[mask] = 1`), which depends on
# SettingWithCopy behaviour.
critical = (df_agg.Volume_Monthly >= 4) | (df_agg.ReclameAqui == 'yes')
# Alternative criteria considered but currently disabled:
#   (df_agg.Volume_7days > 3) | (df_agg.Volume_15days > 3)
#   (df_agg['%Insatisfação(CSAT)'] == 1) | (df_agg['%NFCR'] == 1)
df_agg.loc[critical, 'Target'] = 1
In [33]:
# Class balance check: the target is heavily imbalanced (~3% positives,
# see the counts printed below), motivating the down-sampling step.
print(df_agg.Target.value_counts(), '\n')
df_agg.Target.value_counts().plot(kind='bar', title='target')
0    147946
1      4588
Name: Target, dtype: int64 

Out[33]:
<AxesSubplot:title={'center':'target'}>

Modeling with Down-Sampling

In [15]:
# NOTE(review): this creates an alias, not a copy — mutating `df` would
# also mutate `df_agg`. Use df_agg.copy() if independence is needed.
df = df_agg
In [16]:
# Balance the classes by down-sampling the majority (Target == 0) class
# down to the size of the minority class.
from sklearn.utils import resample

df_majority = df.loc[df.Target == 0]
df_minority = df.loc[df.Target == 1]

# Sample without replacement so each kept majority row is unique; the
# fixed random_state makes the sample reproducible.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=df_minority.shape[0],
    random_state=42,
)

# Majority sample first, then all minority rows — order kept identical so
# the later train/test split is reproducible.
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

df_downsampled.Target.value_counts()
Out[16]:
1    4588
0    4588
Name: Target, dtype: int64

Visualization

In [17]:
# Automated EDA report on the balanced sample.
# NOTE(review): the pandas_profiling package was renamed to
# ydata-profiling; consider migrating the import.
from pandas_profiling import ProfileReport
prof = ProfileReport(df_downsampled)
prof



Out[17]:

In [35]:
# Pairwise feature distributions, coloured by the target class, on the
# balanced sample.
columns = ['Volume_7days', 'Volume_15days', 'Tempo_Medio_Chat',
           'Tempo_Medio_Email', 'AWT_Chat', '%NFCR',
           '%Insatisfação(CSAT)', 'CSAT_Rated', 'Plan_value',
           'Count_checkin', 'Cancel', 'Target']
sns.pairplot(df_downsampled.filter(columns), hue='Target')
Out[35]:
<seaborn.axisgrid.PairGrid at 0x24aac98dc40>

Modeling

Training and Testing Sets

In [18]:
# Feature set used for modelling (Target, id and categorical columns
# such as Month/ReclameAqui/SocialMedia are excluded).
features = ['Volume_7days', 'Volume_15days',
            'Tempo_Medio_Chat',
            'Tempo_Medio_Email', 'AWT_Chat',
            '%NFCR', '%Insatisfação(CSAT)',
            'CSAT_Rated', 'Plan_value', 
            'Count_checkin', 'Cancel']
In [19]:
# 75/25 train/test split on the balanced sample, fixed seed for
# reproducibility.
from sklearn.model_selection import train_test_split

X = df_downsampled.filter(features)
y = df_downsampled.Target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

Decision Tree

In [20]:
# Shallow decision tree — depth capped at 2 so the learned rules stay
# human-readable in the plot below.
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

classifier_tree = DecisionTreeClassifier(random_state=0, max_depth=2)
classifier_tree.fit(X_train, y_train)

# Predictions on the held-out test set.
y_pred_tree = classifier_tree.predict(X_test)
In [24]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score

# Headline metrics for the decision tree on the held-out test set.
print(f'Accuracy = {accuracy_score(y_test, y_pred_tree):.2f} \nRecall = {recall_score(y_test, y_pred_tree):.2f}')

# Confusion-matrix heatmap.
# Fixes: 'Desicion' typo in the displayed title, and adds the axis
# labels so this plot matches the random-forest confusion matrix.
cm_rf = confusion_matrix(y_test, y_pred_tree)
plt.title('Confusion Matrix (Decision Tree)', size=16)
sns.heatmap(cm_rf, annot=True, fmt='.2f', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Accuracy = 0.89 
Recall = 0.89
In [25]:
# Visualise the fitted tree. Class names are listed in ascending order
# of the target labels (0 = non_critical, 1 = critical).
from sklearn import tree

_ = tree.plot_tree(
    classifier_tree,
    feature_names=X_train.columns,
    class_names=['non_critical', 'critical'],
    filled=True,
)

Random Forest

In [26]:
# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier_rf = RandomForestClassifier(n_estimators = 100,
                                       max_depth=4,
                                       criterion = 'entropy', 
                                       random_state = 0)
classifier_rf.fit(X_train, y_train)

# Predicting the Test set results
y_pred_rf = classifier_rf.predict(X_test)

# K-foldcross validation
from sklearn.model_selection import cross_val_score
accuracies_rf = cross_val_score(estimator = classifier_rf, X = X_train, y = y_train, cv = 10, n_jobs = -1)
mean_rf = accuracies_rf.mean()
# NOTE(review): this holds the standard deviation, not the variance —
# the name is misleading (variance would be .var() or .std()**2).
variance_rf = accuracies_rf.std()
In [27]:
# Re-imports kept so this cell also runs standalone (same names were
# already imported in the decision-tree metrics cell above).
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score

# Headline metrics for the random forest on the held-out test set.
print(f'Accuracy = {accuracy_score(y_test, y_pred_rf):.2f} \nRecall = {recall_score(y_test, y_pred_rf):.2f}')

# Confusion-matrix heatmap.
cm_rf = confusion_matrix(y_test, y_pred_rf)
plt.title('Confusion Matrix (Random Forest)', size=16)
sns.heatmap(cm_rf, annot=True, fmt='.2f', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Accuracy = 0.90 
Recall = 0.92
In [28]:
# ROC curves from the predicted class probabilities.
# NOTE(review): scikitplot's plot_roc_curve is deprecated in newer
# releases in favour of plot_roc — confirm against the installed version.
import scikitplot as skplt #to make things easy
y_pred_proba = classifier_rf.predict_proba(X_test)
skplt.metrics.plot_roc_curve(y_test, y_pred_proba)
plt.title("ROC Curves - Random Forest")
plt.show()
In [29]:
# Feature importances from the fitted forest, sorted largest-first.
fi = pd.DataFrame({
    'feature': list(X_train.columns),
    'importance': classifier_rf.feature_importances_,
}).sort_values('importance', ascending=False)

plt.barh(fi.feature, fi.importance)
plt.xlabel("Random Forest Feature Importance")
Out[29]:
Text(0.5, 0, 'Random Forest Feature Importance')

Output Model

In [30]:
# NOTE(review): x_test is assigned but never used below, and the
# lower-case name is easy to confuse with X_test — consider removing.
x_test = pd.concat([X_test, y_test], axis=1)
# Attach predicted class and positive-class probability to the test rows,
# then join back to the full frame on the shared index for inspection.
df_output = (X_test.assign(Pred=classifier_rf.predict(X_test),
                           Prob=classifier_rf.predict_proba(X_test)[:,-1]))
df_pred = pd.merge(df_agg, df_output[['Pred', 'Prob']], how='inner', left_index=True, right_index=True)
df_pred.iloc[:100,:]
Out[30]:
Requester_ID Month Volume_7days Volume_15days Volume_Monthly ReclameAqui SocialMedia Tempo_Medio_Chat Tempo_Medio_Email AWT_Chat %NFCR %Insatisfação(CSAT) CSAT_Rated Plan_value Count_checkin Cancel Target Pred Prob
5 10016341 2021_01 4 5 5 no no 1305.762750 7878.166667 20.250000 1.00 0.0 0 0.00 5.0 1.0 1 1 0.944032
90 10115473 2021_01 1 1 1 no no 0.000000 16483.916667 0.000000 0.00 0.0 0 119.90 15.0 0.0 0 0 0.126208
399 1072347 2021_01 0 1 1 no no 0.000000 2825.750000 0.000000 0.00 1.0 1 119.90 15.0 0.0 0 0 0.177869
445 10780626 2021_01 1 1 1 no no 0.000000 25347.375000 0.000000 0.00 0.0 0 119.90 15.0 0.0 0 0 0.126208
455 1078765 2021_01 1 4 4 no no 704.383333 21835.625000 12.333333 0.25 0.0 0 85.09 2.0 1.0 1 1 0.929146
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7381 5510981 2021_01 1 1 1 no no 0.000000 5433.250000 0.000000 0.00 0.0 0 1531.64 17.0 0.0 0 0 0.121006
7404 5534147 2021_01 0 1 1 no no 0.000000 3537.250000 0.000000 0.00 0.0 0 4999.00 5.0 0.0 0 0 0.104076
7440 5587752 2021_01 0 4 4 yes no 2688.610500 5405.708333 39.000000 0.25 1.0 1 119.90 15.0 0.0 1 1 0.932023
7514 5687280 2021_01 1 1 1 no no 0.000000 5202.000000 0.000000 0.00 0.0 1 119.90 15.0 0.0 0 0 0.121002
7524 5697571 2021_01 2 2 2 no no 0.000000 7441.416667 0.000000 0.50 0.0 0 119.90 15.0 0.0 0 1 0.564645

100 rows × 19 columns

In [31]:
#df_pred.to_csv('../../Data/Random_Forest_Prediction.csv')
In [ ]:
 
In [ ]: